import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
%%HTML
<script src="require.js"></script>
# 設定pandas顯示格式
pd.options.display.float_format = '{:,.2f}'.format
# 設定plotly呈現
import plotly.io as pio
pio.renderers.default = 'notebook'
# 修改字體
plt.rcParams['font.sans-serif']= ['Microsoft JhengHei']
plt.rcParams['axes.unicode_minus']=False
df = pd.read_csv("train.csv")
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 550068 entries, 0 to 550067 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 User_ID 550068 non-null int64 1 Product_ID 550068 non-null object 2 Gender 550068 non-null object 3 Age 550068 non-null object 4 Occupation 550068 non-null int64 5 City_Category 550068 non-null object 6 Stay_In_Current_City_Years 550068 non-null object 7 Marital_Status 550068 non-null int64 8 Product_Category_1 550068 non-null int64 9 Product_Category_2 376430 non-null float64 10 Product_Category_3 166821 non-null float64 11 Purchase 550068 non-null int64 dtypes: float64(2), int64(5), object(5) memory usage: 50.4+ MB
df.head()
| User_ID | Product_ID | Gender | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | Product_Category_1 | Product_Category_2 | Product_Category_3 | Purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | P00069042 | F | 0-17 | 10 | A | 2 | 0 | 3 | NaN | NaN | 8370 |
| 1 | 1000001 | P00248942 | F | 0-17 | 10 | A | 2 | 0 | 1 | 6.00 | 14.00 | 15200 |
| 2 | 1000001 | P00087842 | F | 0-17 | 10 | A | 2 | 0 | 12 | NaN | NaN | 1422 |
| 3 | 1000001 | P00085442 | F | 0-17 | 10 | A | 2 | 0 | 12 | 14.00 | NaN | 1057 |
| 4 | 1000002 | P00285442 | M | 55+ | 16 | C | 4+ | 0 | 8 | NaN | NaN | 7969 |
# Product_Category_2 and Product_Category_3 are additional category. They could be empty, so we just ignore them.
df.loc[:, :"Product_Category_1"].isna().any()
User_ID False Product_ID False Gender False Age False Occupation False City_Category False Stay_In_Current_City_Years False Marital_Status False Product_Category_1 False dtype: bool
df["Purchase"].isna().any()
False
df.loc[:, :"Product_Category_1"].duplicated().any()
False
Regardless of frequency or amount of purchase, the proportion of male customers is higher than that of female customers.
gender = df.groupby("Gender", as_index=False).agg(
frequency=("User_ID", "count"), total_amt=("Purchase", "sum"))
fig = px.pie(gender, names="Gender", values="frequency",
hole=0.5, title="Purchase Frequency by Gender")
fig.update_traces(textposition='inside', textinfo='percent+label', textfont_size=15)
fig.show()
fig = px.pie(gender, names="Gender", values="total_amt",
hole=0.5, title="Purchase Amount by Gender")
fig.update_traces(textposition='inside', textinfo='percent+label', textfont_size=15)
fig.show()
plt.figure(figsize=(8, 4), dpi=200)
plt.title("Purchased Products Price Distribution")
sns.histplot(df, x="Purchase", hue="Gender", kde=True)
plt.show()
user_buy = df.groupby(["User_ID", "Gender"], as_index=False).agg(
frequency=("Product_ID", "count"), total_amt=("Purchase", "sum"))
user_buy
| User_ID | Gender | frequency | total_amt | |
|---|---|---|---|---|
| 0 | 1000001 | F | 35 | 334093 |
| 1 | 1000002 | M | 77 | 810472 |
| 2 | 1000003 | M | 29 | 341635 |
| 3 | 1000004 | M | 14 | 206468 |
| 4 | 1000005 | M | 106 | 821001 |
| ... | ... | ... | ... | ... |
| 5886 | 1006036 | F | 514 | 4116058 |
| 5887 | 1006037 | F | 122 | 1119538 |
| 5888 | 1006038 | F | 12 | 90034 |
| 5889 | 1006039 | F | 74 | 590319 |
| 5890 | 1006040 | M | 180 | 1653299 |
5891 rows × 4 columns
plt.figure(figsize=(8, 4), dpi=200)
plt.title("customers' Consumption Amount Distribution")
fig = sns.histplot(user_buy, x="total_amt", hue="Gender", kde=True)
fig.set(xlim=(user_buy.total_amt.min(), 6e6))
xlabels = ['{:,.0f}'.format(x) + 'M' for x in fig.get_xticks()/1000000]
fig.set_xticklabels(xlabels)
plt.show()
C:\Users\l5103\AppData\Local\Temp\ipykernel_5992\3550260920.py:6: UserWarning: FixedFormatter should only be used together with FixedLocator
for i in range(5):
order = ["first", "second", "third", "forth", "fifth"]
upper_lmt = np.percentile(user_buy.total_amt, 100-(i*20))
lower_lmt = np.percentile(user_buy.total_amt, 80-(i*20))
print(
f"The consumption amount of the {order[i]} group is between {lower_lmt} and {upper_lmt}")
The consumption amount of the first group is between 1355245.0 and 10536909.0 The consumption amount of the second group is between 698842.0 and 1355245.0 The consumption amount of the third group is between 383455.0 and 698842.0 The consumption amount of the forth group is between 205272.0 and 383455.0 The consumption amount of the fifth group is between 46681.0 and 205272.0
user_buy["consum_group"] = pd.qcut(user_buy.total_amt, 5, labels=[
"first", "second", "third", "forth", "fifth"][::-1])
user_buy["consum_group"]
0 forth
1 second
2 forth
3 forth
4 second
...
5886 first
5887 second
5888 fifth
5889 third
5890 first
Name: consum_group, Length: 5891, dtype: category
Categories (5, object): ['fifth' < 'forth' < 'third' < 'second' < 'first']
Even though the first group of customers doesn't account for 80% of consumption, it still accounts for over 50%. Besides, the consumption of the first group and the second group almost account 80%.
group_consum = user_buy.groupby(
"consum_group", as_index=False).agg({"total_amt": "sum"})
fig = px.pie(group_consum, names="consum_group", values="total_amt", hole=0.5,
title="What percentage of total consumption does each group account for?",
category_orders={"consum_group": ["first", "second", "third", "forth", "fifth"][::-1]})
fig.update_traces(textposition='outside', textinfo='percent+label',textfont_size=15)
fig.show()
On the other hand, the purchase frequency of the first group of customers is the most, too.
fig = px.box(user_buy, x="consum_group", y="frequency", color="consum_group", category_orders={
"consum_group": ["first", "second", "third", "forth", "fifth"][::-1]},
title="How often does each group of customers purchase?")
fig.show()
merged = df[['User_ID', 'Age', 'Occupation', 'City_Category',
'Stay_In_Current_City_Years', 'Marital_Status', ]].drop_duplicates()
users = user_buy.merge(merged, on="User_ID")
users
| User_ID | Gender | frequency | total_amt | consum_group | Age | Occupation | City_Category | Stay_In_Current_City_Years | Marital_Status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000001 | F | 35 | 334093 | forth | 0-17 | 10 | A | 2 | 0 |
| 1 | 1000002 | M | 77 | 810472 | second | 55+ | 16 | C | 4+ | 0 |
| 2 | 1000003 | M | 29 | 341635 | forth | 26-35 | 15 | A | 3 | 0 |
| 3 | 1000004 | M | 14 | 206468 | forth | 46-50 | 7 | B | 2 | 1 |
| 4 | 1000005 | M | 106 | 821001 | second | 26-35 | 20 | A | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5886 | 1006036 | F | 514 | 4116058 | first | 26-35 | 15 | B | 4+ | 1 |
| 5887 | 1006037 | F | 122 | 1119538 | second | 46-50 | 1 | C | 4+ | 0 |
| 5888 | 1006038 | F | 12 | 90034 | fifth | 55+ | 1 | C | 2 | 0 |
| 5889 | 1006039 | F | 74 | 590319 | third | 46-50 | 0 | B | 4+ | 1 |
| 5890 | 1006040 | M | 180 | 1653299 | first | 26-35 | 6 | B | 2 | 0 |
5891 rows × 10 columns
We can see the age of first group of customers is a little more concentrated in 18 - 45 years old.
age = users.groupby(["consum_group", "Age"], as_index=False).agg(
Count=("User_ID", "count"))
fig = px.bar(age, x="consum_group", y="Count", color="Age",
title="Age Distribution between different groups of customers",)
fig.show()
Whether it's the first chart or the second, it's a little bit hard to figure out a specific pattern.
occupation = users.groupby(["consum_group","Occupation"], as_index=False).agg(Count=("User_ID","count"))
occupation["Occupation"] = occupation["Occupation"].astype("category")
fig=px.bar(occupation,x="Count",y="consum_group",color="Occupation",orientation="h",title="Occupation Distribution between different groups of customers")
fig.show()
top7_ocu = df.Occupation.value_counts().head(7).index
# create a chart which only focuses on the top 7 occupations.
occupation = users[users.Occupation.isin(top7_ocu)].groupby(
["consum_group", "Occupation"], as_index=False).agg(Count=("User_ID", "count"))
occupation["Occupation"] = occupation["Occupation"].astype("category")
fig = px.bar(occupation, x="Count", y="consum_group",
color="Occupation", orientation="h",
title="Occupation Distribution between different groups of customers")
fig.show()
city_ratio = users.City_Category.value_counts().sort_index()
fig = px.pie(city_ratio, names=city_ratio.index, values=city_ratio.values,
hole=0.5, title="Which city do the customers mainly live in?")
fig.update_traces(textinfo='percent+label', sort=False, textfont_size=15)
fig.show()
If we just look at the city where the most customers live, City C seems to be important. However, when we take a look at the difference between each group, we'll find that City B is the city where first group of customers mainly live, and the next is City A.
city = users.groupby(["consum_group", "City_Category"],
as_index=False).agg(Count=("User_ID", "count"))
fig = px.bar(city, x="consum_group", y="Count", color="City_Category",
title="Which city do different groups of customers mainly live in?")
fig.show()
There's no obvious difference between groups.
stay = users.groupby(["consum_group", "Stay_In_Current_City_Years"],
as_index=False).agg(Count=("User_ID", "count"))
fig = px.bar(stay, x="consum_group", y="Count",
color="Stay_In_Current_City_Years",
title="How are the years of staying in current city of different groups of customers?")
fig.show()
There's no obvious difference between groups.
marital_stt = users.groupby(["consum_group", "Marital_Status"], as_index=False).agg(
Count=("User_ID", "count"))
marital_stt.Marital_Status = marital_stt.Marital_Status.astype('category')
fig = px.bar(marital_stt, x="consum_group", y="Count", color="Marital_Status",
title="How's the marital status of different groups of customers?")
fig.show()
products = df.groupby(["Product_ID", "Product_Category_1"], as_index=False).agg(
sales_volume=("Product_ID", "count"), sales_figures=("Purchase", "sum"))
products
| Product_ID | Product_Category_1 | sales_volume | sales_figures | |
|---|---|---|---|---|
| 0 | P00000142 | 3 | 1152 | 12837476 |
| 1 | P00000242 | 2 | 376 | 3967496 |
| 2 | P00000342 | 5 | 244 | 1296475 |
| 3 | P00000442 | 5 | 92 | 441173 |
| 4 | P00000542 | 5 | 149 | 807212 |
| ... | ... | ... | ... | ... |
| 3626 | P0099442 | 6 | 200 | 2870383 |
| 3627 | P0099642 | 8 | 13 | 83710 |
| 3628 | P0099742 | 3 | 126 | 991948 |
| 3629 | P0099842 | 5 | 102 | 737312 |
| 3630 | P0099942 | 5 | 14 | 78019 |
3631 rows × 4 columns
# divide products into 5 groups
products["sales_group"] = pd.qcut(products.sales_figures, 5, labels=[
"first", "second", "third", "forth", "fifth"][::-1])
products
| Product_ID | Product_Category_1 | sales_volume | sales_figures | sales_group | |
|---|---|---|---|---|---|
| 0 | P00000142 | 3 | 1152 | 12837476 | first |
| 1 | P00000242 | 2 | 376 | 3967496 | first |
| 2 | P00000342 | 5 | 244 | 1296475 | second |
| 3 | P00000442 | 5 | 92 | 441173 | third |
| 4 | P00000542 | 5 | 149 | 807212 | second |
| ... | ... | ... | ... | ... | ... |
| 3626 | P0099442 | 6 | 200 | 2870383 | first |
| 3627 | P0099642 | 8 | 13 | 83710 | fifth |
| 3628 | P0099742 | 3 | 126 | 991948 | second |
| 3629 | P0099842 | 5 | 102 | 737312 | second |
| 3630 | P0099942 | 5 | 14 | 78019 | fifth |
3631 rows × 5 columns
As for sales figures of products, the first group accounts for 73.3%, which corresponds the 80/20 rule more than the situation of customers.
group_sales = products.groupby(
"sales_group", as_index=False).agg({"sales_figures": "sum"})
fig = px.pie(group_sales, names="sales_group", values="sales_figures", hole=0.5,
title="What percentage of sales figures does <br>each group of products account for?",
category_orders={"sales_group": ["first", "second", "third", "forth", "fifth"][::-1]})
fig.update_traces(textposition='outside', textinfo='percent+label',textfont_size=15)
fig.show()
The sales volume of the first group are higher than all the other groups.
fig = px.box(products, x="sales_group", y="sales_volume", color="sales_group", category_orders={
"sales_group": ["first", "second", "third", "forth", "fifth"][::-1]},
title="How are the sales volume of each group of products?")
fig.show()
The box plots from the fifth group to the third group of products are somewhat similar and share a same pattern.
products["price"] = products.sales_figures / products.sales_volume
fig = px.box(products, x="sales_group", y="price", color="sales_group", category_orders={
"sales_group": ["first", "second", "third", "forth", "fifth"][::-1]},
title="How are the prices of each group of products?")
fig.show()
This chart combines previous ones. The size of bubbles represents the sales figures of products.
# hover on bubbles to see product ids.
fig = px.scatter(products, x="sales_volume", y="price", size="sales_figures",
color="sales_group", size_max=50,
hover_name="Product_ID",
title="Products Bubble Chart:<br>Combining price, sales volume and sales figures.",
category_orders={"sales_group": ["first", "second", "third", "forth", "fifth"]})
fig.show()
category_ratio = products.Product_Category_1.value_counts().sort_index()
fig = px.pie(names=category_ratio.index, values=category_ratio.values, hole=0.5,
title="What percentage of all products does each category account for?",)
fig.update_traces(textposition='inside', textinfo='percent+label', sort=False)
fig.update_layout(uniformtext_minsize=15, uniformtext_mode='hide')
fig.show()
We can see that the main categories in all products in order are 8, 5 and 1 etc. However, the next plot shows that in the first the order of categories is 1, 5 and 8.
category = products.groupby(["sales_group", "Product_Category_1"]).agg(
Count=("Product_ID", "count")).reset_index()
category["Product_Category_1"] = category["Product_Category_1"].astype(
"category")
fig = px.bar(category, x="Count", y="sales_group",
color="Product_Category_1", orientation="h",
title="What's the main category in different groups of products?")
fig.show()
The first group of categories accounts for 79.1 %. So in the case of categories, the 80/20 rule is quite applicable.
ctgr = products.groupby("Product_Category_1", as_index=False).agg(sales_figures=(
"sales_figures", "sum"), avg_price=("price", "mean"), avg_sales_volume=("sales_volume", "mean"))
ctgr["ctgr_group"] = pd.qcut(ctgr.sales_figures, 5, labels=[
"first", "second", "third", "forth", "fifth"][::-1])
group_ctgr = ctgr.groupby("ctgr_group", as_index=False).agg(
{"sales_figures": "sum"})
fig = px.pie(group_ctgr, names="ctgr_group", values="sales_figures", hole=0.5,
title="What percentage of sales figures does <br>each group of categories account for?",
category_orders={"ctgr_group": ["first", "second", "third", "forth", "fifth"]})
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(uniformtext_minsize=15, uniformtext_mode='hide')
fig.show()
This chart shows how those categories that make the highest sales figures are concentrated in particular ones.
fig = px.scatter(ctgr, x="avg_sales_volume", y="avg_price", size="sales_figures",
color="ctgr_group", text="Product_Category_1", size_max=150,
title="Average price and sales volume of products in different Categories",
category_orders={"ctgr_group": ["first", "second", "third", "forth", "fifth"]})
fig.update_traces(textfont_size=16)
# fig.update_layout(uniformtext_minsize=16, uniformtext_mode='hide')
fig.show()
If there's any suggestion, please feel free to leave your comments. And if you think this notebook is useful, you can fork or upvote it.